astra <- read.csv("https://raw.githubusercontent.com/QMSS-G5063-2023/course_content/main/Exercises/03_astronauts%20-%20GRADED/astronauts.csv?token=GHSAT0AAAAAAB557SY67DYJGWWUEAM4MRUIY7M3XOA")
head(astra)
## id number nationwide_number name
## 1 1 1 1 Gagarin, Yuri
## 2 2 2 2 Titov, Gherman
## 3 3 3 1 Glenn, John H., Jr.
## 4 4 3 1 Glenn, John H., Jr.
## 5 5 4 2 Carpenter, M. Scott
## 6 6 5 2 Nikolayev, Andriyan
## original_name sex year_of_birth
## 1 ГÐ\220ГÐ\220Ð Ð\230Ð\235 ЮÑ\200ий Ð\220лекÑ\201еевич male 1934
## 2 ТÐ\230ТОВ ГеÑ\200ман Степанович male 1935
## 3 Glenn, John H., Jr. male 1921
## 4 Glenn, John H., Jr. male 1921
## 5 Carpenter, M. Scott male 1925
## 6 Ð\235Ð\230КОЛÐ\220ЕВ Ð\220ндÑ\200иÑ\217н ГÑ\200игоÑ\200ьевич male 1929
## nationality military_civilian selection year_of_selection
## 1 U.S.S.R/Russia military TsPK-1 1960
## 2 U.S.S.R/Russia military TsPK-1 1960
## 3 U.S. military NASA Astronaut Group 1 1959
## 4 U.S. military NASA Astronaut Group 2 1959
## 5 U.S. military NASA- 1 1959
## 6 U.S.S.R/Russia military TsPK-1 1960
## mission_number total_number_of_missions occupation year_of_mission
## 1 1 1 pilot 1961
## 2 1 1 pilot 1961
## 3 1 2 pilot 1962
## 4 2 2 PSP 1998
## 5 1 1 Pilot 1962
## 6 1 2 pilot 1962
## mission_title ascend_shuttle in_orbit descend_shuttle hours_mission
## 1 Vostok 1 Vostok 1 Vostok 2 Vostok 3 1.77
## 2 Vostok 2 Vostok 2 Vostok 2 Vostok 2 25.00
## 3 MA-6 MA-6 MA-6 MA-6 5.00
## 4 STS-95 STS-95 STS-95 STS-95 213.00
## 5 Mercury-Atlas 7 Mercury-Atlas 7 Mercury-Atlas 7 Mercury-Atlas 7 5.00
## 6 Vostok 3 Vostok 3 Vostok 3 Vostok 3 94.00
## total_hrs_sum eva_instances eva_hrs_mission total_eva_hrs
## 1 1.77 0 0 0
## 2 25.30 0 0 0
## 3 218.00 0 0 0
## 4 218.00 0 0 0
## 5 5.00 0 0 0
## 6 519.33 0 0 0
astra$age_of_selection <- astra$year_of_selection - astra$year_of_birth
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Scatter plot
ggplot(astra, aes(x = year_of_birth, y = age_of_selection, color=sex)) +
geom_point(alpha=0.5) +
labs(x="Year of Birth", y="Age of Selection", color="sex") +
theme(legend.position="right") +
ggtitle("Age of selection by years of birth: gender difference")
# Few female pilots
# Female astraunauts began to be selected from 1940's birth cohort
# Age range differ by gender
# Box plot (show age distribution for each birth cohort, by gender)
astra <- within(astra, {
birth_cohort <- NA
birth_cohort[year_of_birth<1930] <- "Before 1920's"
birth_cohort[year_of_birth>=1930 & year_of_birth <1940] <- "1930's"
birth_cohort[year_of_birth>=1940 & year_of_birth < 1950] <- "1940's"
birth_cohort[year_of_birth>=1950 & year_of_birth < 1960] <- "1950's"
birth_cohort[year_of_birth>=1960 & year_of_birth < 1970] <- "1960's"
birth_cohort[year_of_birth>=1970 ] <- "1970's and after"
})
ggplot(astra, aes(x=reorder(birth_cohort, year_of_birth, na.ra=TRUE), y=age_of_selection)) +
geom_boxplot(aes(fill=sex), outlier.colour = "transparent",
alpha = 0.3) + #<< fill in different colors by "sex"
coord_flip() + labs(x="Year of Birth", y="Age of Selection") +
theme(legend.position="right") +
ggtitle("Age of selection by birth cohorts: gender difference")
# Few female pilots
# Female astraunauts began to be selected not until 1940's birth cohort
# Age range differ by gender
ggplot(astra, aes(x=reorder(birth_cohort, year_of_birth, na.ra=TRUE), y=age_of_selection)) +
geom_jitter(shape=21, aes(fill=sex), color="transparent",
position = position_jitter(w=0.1)) +
coord_flip() + labs(x="", y="age of selection") +
theme(legend.position="right") +
ggtitle("Age of selection by years of birth: gender difference")
# Few female pilots
# Female astraunauts began to be selected from 1940's birth cohort
# Age range differ by gender
The scatter chart and the jitter chart both shows that (1) there are fewer female astranauts, (2) female astranauts took part in the space mission beginning from 1930’s birth cohort, and (3) the age range is more restrictive for females.
I would recommend to use the jitter chart because it shows the gender and age distribution of the astraunauts by birth cohorts more intuitively.
I considered the principles of similarities (using consistent color for all male and female) and proximity (used the distance between data points to group/distinguish birth cohorts). I tried to make the graph simple by not adding box plot to the graph since the proximity of data points show the age range.
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
astra <- astra %>%
group_by(nationality) %>%
mutate(year_of_first_mission = min(year_of_mission))
astra <- astra %>%
group_by(mission_title) %>%
mutate(joint_mission = as.integer(n_distinct(nationality) > 1))
summarize(astra, id, nationality, year_of_first_mission, mission_title, joint_mission, eva_instances, eva_hrs_mission, total_eva_hrs)
## `summarise()` has grouped output by 'mission_title'. You can override using the
## `.groups` argument.
## # A tibble: 1,277 x 8
## # Groups: mission_title [362]
## mission_title id nationality year_of_first_m~ joint_mission eva_instances
## <chr> <int> <chr> <int> <int> <int>
## 1 1 205 U.S.S.R/Rus~ 1961 1 2
## 2 1 284 U.S.S.R/Rus~ 1961 1 2
## 3 1 434 U.S.S.R/Rus~ 1961 1 0
## 4 1 438 Syria 1987 1 0
## 5 1 455 U.S.S.R/Rus~ 1961 1 0
## 6 1 462 U.S. 1962 1 0
## 7 1 815 U.S.S.R/Rus~ 1961 1 0
## 8 10 386 U.S.S.R/Rus~ 1961 1 1
## 9 10 587 U.S.S.R/Rus~ 1961 1 0
## 10 10 758 U.S. 1962 1 2
## # ... with 1,267 more rows, and 2 more variables: eva_hrs_mission <dbl>,
## # total_eva_hrs <dbl>
ggplot(astra, aes(x = reorder(nationality, -year_of_first_mission), y = year_of_mission, color=joint_mission)) +
geom_point(alpha=0.5) +
coord_flip() +
labs(x="", y="Year of missions", color="joint mission") +
theme(legend.position="right") +
ggtitle("World-wide history of space missions: astronauts across countries")
library(dplyr)
agg <- astra %>%
group_by(name) %>%
summarize('num_of_eva'=sum(eva_instances),
'total_eva'=sum(total_eva_hrs))
summarize(agg, name, num_of_eva, total_eva)
## # A tibble: 564 x 3
## name num_of_eva total_eva
## <chr> <int> <dbl>
## 1 Acaba, Joseph M. 3 59.8
## 2 Acton, Loren Wilbur 0 0
## 3 Adamson, James C. 0 0
## 4 Afanasyev, Viktor Mikhaylovich 7 152.
## 5 Aidyn (Aydyn) Akanovich Aimbetov 0 0
## 6 Akers, Thomas D. 4 121.
## 7 Akiyama, Toyohiro 0 0
## 8 Aksyonov, Vladimir 0 0
## 9 Al-saud, Sultan bin Salman 0 0
## 10 Al Mansoori, Hazzaa 0 0
## # ... with 554 more rows
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.1.3
dplot <- ggplot(agg, aes(x=num_of_eva, y=total_eva)) +
geom_bar(stat="summary", fill="steelblue")+
labs(x="The number of EVA", y="Total hours of EVA")+
ggtitle("The relationship between the number of EVA and total hours of EVA")
dplot
## No summary function supplied, defaulting to `mean_se()`
# Whether the mission lasted over 30 days
astra$success <- as.numeric(astra$hours_mission > 720)
head(astra)
## # A tibble: 6 x 29
## # Groups: mission_title [6]
## id number nationwide_number name original_name sex year_of_birth
## <int> <int> <int> <chr> <chr> <chr> <int>
## 1 1 1 1 Gagarin, Yuri "ГÐ\u0090Г~ male 1934
## 2 2 2 2 Titov, Gherm~ "ТИТОВ ~ male 1935
## 3 3 3 1 Glenn, John ~ "Glenn, John~ male 1921
## 4 4 3 1 Glenn, John ~ "Glenn, John~ male 1921
## 5 5 4 2 Carpenter, M~ "Carpenter, ~ male 1925
## 6 6 5 2 Nikolayev, A~ "Ð\u009dИК~ male 1929
## # ... with 22 more variables: nationality <chr>, military_civilian <chr>,
## # selection <chr>, year_of_selection <int>, mission_number <int>,
## # total_number_of_missions <int>, occupation <chr>, year_of_mission <int>,
## # mission_title <chr>, ascend_shuttle <chr>, in_orbit <chr>,
## # descend_shuttle <chr>, hours_mission <dbl>, total_hrs_sum <dbl>,
## # eva_instances <int>, eva_hrs_mission <dbl>, total_eva_hrs <dbl>,
## # age_of_selection <int>, birth_cohort <chr>, ...
astra$yearbin <- cut(astra$year_of_mission,
breaks = c(1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020),
labels = c('1910s', '1920s', '1930s', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s'))
bargraph <- ggplot(astra, aes(x=yearbin, y=success, color = sex))+
geom_bar(stat = "summary", fun = "mean", fill = "grey", position="dodge") +
labs(x="", y="% of mission lasted over 30 days")
bargraph
missionhrs <- astra %>%
group_by(year_of_mission, sex) %>%
summarize('avghours_mission'=mean(hours_mission),
'minhours_mission'=min(hours_mission),
'maxhours_mission'=max(hours_mission), na.ra=T)
## `summarise()` has grouped output by 'year_of_mission'. You can override using
## the `.groups` argument.
dotlineplot <- ggplot(missionhrs, aes(x=year_of_mission, y=avghours_mission, color = sex))+
geom_line() + geom_point(size=3) +
labs(x="", y="Average hours of space missions") +
ggtitle("Average hours of space missions over time (1960-2020)")
dotlineplot
* The bar graph charts how the percentage of missions whose duration
lasted over 30 days(700 hours) changed over every decades, by sex of the
astranauts who participated in the missions. The dotted line graph also
shows the annual trend of average hours of space missions by sex. Both
graphs reveals the interesting pattern that 1) the duration of missions
increased over time, particularly in the last decade, and 2) the gender
difference in the duration of mission disappeared, and even got reversed
in 2010’s. I recommend the dotted line chart as it represents the
overall time trends and changes in gender difference more
intuitively.
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
dotlineplot2 <- ggplot(missionhrs, aes(x=year_of_mission, y=avghours_mission, color = sex, label = minhours_mission))+
geom_line() + geom_point(size=3) +
labs(x="", y="Average hours of space missions") +
ggtitle("Trend in average hourse of space mission")
ggplotly(dotlineplot2,
tooltip = c("year_of_mission", "avghours_mission", "minhours_mission"))
boxplot <- plot_ly(astra, y=~age_of_selection, x =~reorder(birth_cohort, year_of_birth), color = ~sex, type = "box")
m <- list(l = 100, r = 100, b = 100, t = 10, pad = 10) #<<
boxplot <- boxplot %>% layout(autosize = F, width = 900, height = 400,
boxmode="group",
margin = m,
yaxis = list(title = "Age of Selection", showticklabels =T),
xaxis = list(title = "Birth cohort"),
title = 'Age of selection by birth cohorts'
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
boxplot
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
astronaut <- astra %>%
group_by(name) %>%
summarize('total_number_of_missions'=max(total_number_of_missions),
'total_hours_mission'=max(total_hrs_sum),
'total_eva_hrs'=max(total_eva_hrs))
library("DT")
## Warning: package 'DT' was built under R version 4.1.3
# Fixing Column Labels
library(stringr)
pretty_headers <-
gsub("[.]", " ", colnames(astronaut)) %>%
str_to_title()
# Adding Selection Fields and Sliders
astronaut %>%
datatable(
rownames = FALSE,
colnames = pretty_headers,
filter = list(position = "top"),
options = list(
dom = "Bfrtip",
buttons = I("colvis"),
language = list(sSearch = "Filter:")
),
extensions = c("Buttons", "Responsive")
)